WHO wants a data-driven approach that can help suggest, for each country, which area should be given importance in order to efficiently improve the life expectancy of its population. The file Life_Expectancy_Data.csv contains the data.
Building a Prediction engine which predicts the life expectancy based on various features like status of the country, GDP, Alcohol consumption, adult mortality rate etc. To build a prediction engine we will use linear regression model.
Purpose: To perform regression analysis to study life expectancy in different countries
import numpy as np
import pandas as pd
pd.set_option('display.max_rows',800)
pd.set_option('display.max_columns',500)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# importing all libraries and dependencies for machine learning
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
import random
# Load the WHO life-expectancy dataset from the user's home data folder.
df = pd.read_csv("~/data/Life_Expectancy_Data.csv")
# Structural overview: column dtypes, non-null counts, memory usage.
df.info()
# Summary statistics for the numeric columns.
df.describe()
# Peek at the first five rows.
df.head(5)
# Identify numeric vs. categorical columns for later EDA / encoding.
num_col = df.select_dtypes(include=np.number).columns
# Fixed typo in the printed message ("Numerica" -> "Numerical").
print("Numerical columns are as follows.\n", num_col)
cat_col = df.select_dtypes(exclude=np.number).columns
print("Categorical columns are as follows.\n", cat_col)
# Removing the extra space from column names.
# strip() removes spaces before and after each column name, if any.
# Using a lambda scales the cleanup to all columns at once.
df = df.rename(columns=lambda x: x.strip())
# Convert the categorical 'Status' feature to a numeric code so the
# regression model can consume it.
from sklearn import preprocessing  # already imported above; kept so the cell runs standalone
# LabelEncoder assigns integer labels in alphabetical order, so
# "Developed" -> 0 and "Developing" -> 1.
status_encoder = preprocessing.LabelEncoder()
df['Status'] = status_encoder.fit_transform(df['Status'])
df.head()
# Knowing missing values: count NAs per column before imputation.
print(df.isna().sum())
print(df.shape)
# Treating the NA values: replace each column's NAs with that column's mean.
# 'Country' is excluded because a string column has no mean; every other
# column (including the label-encoded 'Status') is numeric at this point.
for i in df.columns.drop('Country'):
    # Assign back instead of fillna(..., inplace=True) on a column slice:
    # the in-place form on a selection is deprecated chained assignment
    # in modern pandas and may silently not update the frame.
    df[i] = df[i].fillna(df[i].mean())
# Checking the distribution of the Y variable (Life expectancy).
plt.figure(figsize=(8, 8), dpi=80)
sns.boxplot(df['Life expectancy'])
plt.title('Life Exp Box plot')
plt.show()
plt.figure(figsize=(8, 8))
plt.title('Life Exp Distribution plot')
# sns.distplot was deprecated and removed from seaborn; histplot with
# kde=True and stat='density' reproduces the same density + KDE picture.
sns.histplot(df['Life expectancy'], kde=True, stat='density')
Summary: The Y variable has a few outliers and is approximately normally distributed, hence the normality assumption of linear regression holds true.
# Re-derive the numeric/categorical split (column names are now stripped
# and 'Status' is numeric after encoding).
num_col = df.select_dtypes(include=np.number).columns
cat_col = df.select_dtypes(exclude=np.number).columns
print('Numerical columns are ', num_col)
print('categorical columns are ', cat_col)
# Check multicollinearity of the features via the correlation matrix.
plt.figure(figsize=(16, 16))
p = sns.heatmap(df[num_col].corr(), annot=True, cmap='RdYlGn', center=0)
# Pairwise scatter plots to see relations between the numeric features.
ax = sns.pairplot(df[num_col])
# Train/test split: 70% train, 30% test, fixed seed for reproducibility.
# Target is 'Life expectancy'; 'Country' is dropped (identifier, not a feature).
y = df[['Life expectancy']]
X = df.drop(columns=['Life expectancy', 'Country'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1234)
# Baseline OLS with a single predictor: 'Income composition of resources'
# (plus an intercept added by sm.add_constant).
X_train1 = sm.add_constant(X_train['Income composition of resources'])
model_1 = sm.OLS(y_train, X_train1).fit()
# Fitted coefficients, then the full regression summary.
model_1.params
print(model_1.summary())
# Second model: add 'Schooling' alongside income composition.
X_train2 = sm.add_constant(
    X_train[['Income composition of resources', 'Schooling']])
model_2 = sm.OLS(y_train, X_train2).fit()
# Coefficients and full summary for comparison with model_1.
model_2.params
print(model_2.summary())
# Third model: add 'Adult Mortality' as well.
predictors_3 = ['Income composition of resources', 'Schooling', 'Adult Mortality']
# Intercept term for statsmodels OLS
X_train3 = sm.add_constant(X_train[predictors_3])
# Fit the model
model_3 = sm.OLS(y_train, X_train3).fit()
# Fitted coefficients
model_3.params
# Full regression summary
print(model_3.summary())
# Running RFE to keep the 15 most important features.
lm = LinearRegression()
lm.fit(X_train, y_train)
# n_features_to_select must be passed by keyword: the positional form
# RFE(lm, 15) was removed in scikit-learn 1.2.
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)
list(zip(X_train.columns, rfe.support_, rfe.ranking_))
# Selecting important features on the basis of the RFE support mask
imp_columns = X_train.columns[rfe.support_]
print(imp_columns)
# Creating an X_train dataframe with the RFE-selected variables
X_train_rfe = X_train[imp_columns]
After obtaining the columns selected by RFE, manually evaluate each model's p-values and VIF values. Until an acceptable range for the p-values and VIFs is found, drop variables one at a time based on the criteria below.
# Seed kept from the original notebook (nothing below is stochastic).
random.seed(0)
# Fit an OLS model on the RFE-selected features, with an intercept.
X_train_rfec = sm.add_constant(X_train_rfe)
lm_rfe = sm.OLS(y_train, X_train_rfec).fit()
print(lm_rfe.summary())
# Variance-inflation factors for the same feature set, highest first.
vif = pd.DataFrame({
    'Features': X_train_rfe.columns,
    'VIF': [variance_inflation_factor(X_train_rfe.values, i)
            for i in range(X_train_rfe.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif
Dropping the 'thinness 5-9 years' feature from the training dataset as it has a high p-value.
# Drop 'thinness 5-9 years' (high p-value in the previous model).
# drop(columns=...) replaces the positional axis argument: df.drop(cols, 1)
# was removed in pandas 2.0.
X_train_rfe1 = X_train_rfe.drop(columns=['thinness 5-9 years'])
# add constant (intercept)
X_train_rfe1b = sm.add_constant(X_train_rfe1)
# building model
lm_rfe1 = sm.OLS(y_train, X_train_rfe1b).fit()
# summary
print(lm_rfe1.summary())
# creating dataframe that contains feature names and their VIF
vif = pd.DataFrame()
vif['Features'] = X_train_rfe1.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe1.values, i) for i in range(X_train_rfe1.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif
'under-five deaths' has a high VIF, hence dropping that feature.
# Drop 'under-five deaths' (highest VIF in the previous model).
# drop(columns=...) replaces the removed positional-axis call drop(col, 1).
X_train_rfe2 = X_train_rfe1.drop(columns='under-five deaths')
# adding constant (intercept)
X_train_rfe2c = sm.add_constant(X_train_rfe2)
# building model
lm_rfe2 = sm.OLS(y_train, X_train_rfe2c).fit()
# summary
print(lm_rfe2.summary())
# VIFs for the remaining features, highest first.
vif = pd.DataFrame()
vif['Features'] = X_train_rfe2.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe2.values, i) for i in range(X_train_rfe2.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif
Since the variable 'Alcohol' has a very high p-value, eliminating that feature from the training dataset.
# Dropping insignificant variable Alcohol (very high p-value).
# drop(columns=...) replaces the removed positional-axis call drop(col, 1).
X_train_rfe3 = X_train_rfe2.drop(columns='Alcohol')
# adding constant (intercept)
X_train_rfe3c = sm.add_constant(X_train_rfe3)
# model building
lm_rfe3 = sm.OLS(y_train, X_train_rfe3c).fit()
# summary
print(lm_rfe3.summary())
# VIF for all remaining features, highest first.
vif = pd.DataFrame()
vif['Features'] = X_train_rfe3.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe3.values, i) for i in range(X_train_rfe3.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif
Dropping 'Schooling' Feature from training data set since it has high VIF
# Drop 'Schooling' (high VIF in the previous model).
# drop(columns=...) replaces the removed positional-axis call drop(col, 1).
X_train_rfe4 = X_train_rfe3.drop(columns='Schooling')
# add a constant (intercept)
X_train_rfe4c = sm.add_constant(X_train_rfe4)
# Model building
lm_rfe4 = sm.OLS(y_train, X_train_rfe4c).fit()
# Summary
print(lm_rfe4.summary())
# VIFs for the remaining features, highest first.
vif = pd.DataFrame()
vif['Features'] = X_train_rfe4.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe4.values, i) for i in range(X_train_rfe4.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif
'Diphtheria' has a high VIF value hence dropping the feature from the training set.
# Drop 'Diphtheria' (high VIF in the previous model).
# drop(columns=...) replaces the removed positional-axis call drop(col, 1).
X_train_rfe5 = X_train_rfe4.drop(columns='Diphtheria')
# adding constant (intercept)
X_train_rfe5c = sm.add_constant(X_train_rfe5)
# building model
lm_rfe5 = sm.OLS(y_train, X_train_rfe5c).fit()
# summary
print(lm_rfe5.summary())
# VIFs for the remaining features, highest first.
vif = pd.DataFrame()
vif['Features'] = X_train_rfe5.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe5.values, i) for i in range(X_train_rfe5.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif
'Polio' has a high VIF value hence dropping the feature from the training set.
# Drop 'Polio' (high VIF in the previous model).
# drop(columns=...) replaces the removed positional-axis call drop(col, 1).
X_train_rfe6 = X_train_rfe5.drop(columns='Polio')
# adding constant (intercept)
X_train_rfe6c = sm.add_constant(X_train_rfe6)
# building model
lm_rfe6 = sm.OLS(y_train, X_train_rfe6c).fit()
# summary
print(lm_rfe6.summary())
# VIFs for the remaining features, highest first.
vif = pd.DataFrame()
vif['Features'] = X_train_rfe6.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe6.values, i) for i in range(X_train_rfe6.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif
'Hepatitis B' has a high VIF value hence dropping the feature from the training set.
# Drop 'Hepatitis B' (high VIF in the previous model).
# drop(columns=...) replaces the removed positional-axis call drop(col, 1).
X_train_rfe7 = X_train_rfe6.drop(columns='Hepatitis B')
# adding constant (intercept)
X_train_rfe7c = sm.add_constant(X_train_rfe7)
# building model
lm_rfe7 = sm.OLS(y_train, X_train_rfe7c).fit()
# summary
print(lm_rfe7.summary())
# VIFs for the remaining features, highest first.
vif = pd.DataFrame()
vif['Features'] = X_train_rfe7.columns
vif['VIF'] = [variance_inflation_factor(X_train_rfe7.values, i) for i in range(X_train_rfe7.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif
## By David Dale https://datascience.stackexchange.com/users/24162/david-dale
def stepwise_selection(X, y,
                       initial_list=(),
                       threshold_in=0.05,
                       threshold_out=0.10,
                       verbose=True):
    """Perform a forward-backward (stepwise) feature selection
    based on p-values from statsmodels.api.OLS.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - features to start with (column names of X); the
            default is an immutable tuple to avoid the shared
            mutable-default pitfall (the function copies it anyway)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    included = list(initial_list)
    while True:
        changed = False
        # forward step: try each excluded feature, add the most significant one
        excluded = list(set(X.columns) - set(included))
        # explicit float dtype avoids the deprecated object-dtype default
        # for an empty/uninitialized Series
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
        # backward step: refit on the included set and drop the worst feature
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except the intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty -> comparison below is False
        if worst_pval > threshold_out:
            changed = True
            # idxmax (label), not argmax: Series.argmax returns an integer
            # position in modern pandas, which would break included.remove()
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included
# Run stepwise selection on the training data and show what it picked.
result = stepwise_selection(X_train, y_train)
print('resulting features:')
print(result)
# NOTE(review): this feature list is hard-coded from a previous stepwise
# run rather than taken from `result` -- confirm the two still agree.
X_train_stepwise = X_train[['Schooling', 'Adult Mortality', 'HIV/AIDS', 'Diphtheria', 'BMI',
                            'Income composition of resources', 'Status', 'percentage expenditure',
                            'Polio', 'Measles', 'Hepatitis B', 'under-five deaths',
                            'infant deaths', 'thinness 1-19 years']]
# Intercept term for statsmodels OLS
X_train_stepwise = sm.add_constant(X_train_stepwise)
# Fit and summarize the stepwise-selected model.
lm_stepwise = sm.OLS(y_train, X_train_stepwise).fit()
print(lm_stepwise.summary())
# Evaluate the stepwise model on the held-out TEST set.
# (The original comment said "Training set", but X_test/y_test are used.)
X_test_stepwise = X_test[['Schooling', 'Adult Mortality', 'HIV/AIDS', 'Diphtheria', 'BMI',
                          'Income composition of resources', 'Status', 'percentage expenditure',
                          'Polio', 'Measles', 'Hepatitis B', 'under-five deaths',
                          'infant deaths', 'thinness 1-19 years']]
X_test_stepwise = sm.add_constant(X_test_stepwise)
actual = y_test['Life expectancy']
prediction = lm_stepwise.predict(X_test_stepwise)
# Mean squared error (symmetric, so the argument order is immaterial).
model_mse = mean_squared_error(prediction, actual)
print(model_mse)
# Model Evaluation: Mean Absolute Percentage Error (MAPE)
def mape(y_true, y_pred):
    """Return the mean absolute percentage error (in %) of y_pred vs y_true.

    Note: undefined (division by zero) if y_true contains zeros; that is
    fine here because life expectancy is strictly positive.
    """
    # Restored the loop-body indentation lost in the notebook export.
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Display the MAPE (%) of the stepwise model on the test set.
mape(actual,prediction)
# Linearity check: actual vs. predicted life expectancy on the test set.
# x/y must be passed by keyword: positional data arguments were removed
# from seaborn's scatterplot API.
sns.scatterplot(x=y_test['Life expectancy'], y=prediction)
plt.title('Linearity Check')
plt.xlabel('Actual value')
plt.ylabel('Predicted value')
# Histogram of the residuals (error terms): should look roughly normal.
fig = plt.figure()
# sns.distplot was removed from seaborn; histplot(kde=True, stat='density')
# reproduces the same density + KDE picture.
sns.histplot(y_test['Life expectancy'] - prediction, bins=20, kde=True, stat='density')
fig.suptitle('Error Term Analysis', fontsize=20)
plt.xlabel('Errors', fontsize=18)